In [41]:
import warnings
warnings.filterwarnings('ignore')
import math

import numpy as np
import pandas as pd

try:
    # Current location of the splitting helpers (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split,KFold
except ImportError:
    # Legacy location, removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split,KFold

In [42]:
# File names of the training data and of the test data to be scored.
datafile_train, datafile_test = 'carvan_train.csv', 'carvan_test.csv'

# Load each CSV file into its own DataFrame.
cd_train = pd.read_csv(datafile_train)
cd_test = pd.read_csv(datafile_test)

In [43]:
# Number of rows in the training frame.
cd_train.shape[0]


Out[43]:
5822

In [44]:
# Preview the first five rows of the training frame.
cd_train.iloc[:5]


Out[44]:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 ... V77 V78 V79 V80 V81 V82 V83 V84 V85 V86
0 33 1 3 2 8 0 5 1 3 7 ... 0 0 0 1 0 0 0 0 0 0
1 37 1 2 2 8 1 4 1 4 6 ... 0 0 0 1 0 0 0 0 0 0
2 37 1 2 2 8 0 4 2 4 3 ... 0 0 0 1 0 0 0 0 0 0
3 9 1 3 3 3 2 3 2 4 5 ... 0 0 0 1 0 0 0 0 0 0
4 40 1 4 2 10 1 4 1 4 7 ... 0 0 0 1 0 0 0 0 0 0

5 rows × 86 columns


In [45]:
# Feature matrix: every column except the target `V86`.
# `axis` is passed by keyword because newer pandas releases no longer accept
# it as a positional argument of `DataFrame.drop`.
x = cd_train.drop(['V86'], axis=1)
# Target vector.
y = cd_train['V86']

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report

Optimizing model...

Run train_test splits on the train data


In [47]:
# Hold out 20% of the training rows for evaluation; the seed fixes the split.
ld_train, ld_test = train_test_split(
    cd_train,
    random_state=2,
    test_size=0.2,
)

In [48]:
# Separate features and target for the 80% fitting split.
# `axis` is passed by keyword because newer pandas releases no longer accept
# it as a positional argument of `DataFrame.drop`.
x80_train = ld_train.drop(['V86'], axis=1)
y80_train = ld_train['V86']

# Same separation for the 20% hold-out split.
x20_test = ld_test.drop(['V86'], axis=1)
y20_test = ld_test['V86']

1. Check ROC_AUC_SCORE {penalty='l1', class_weight=None}


In [49]:
# L1-penalised logistic regression without class re-weighting.
# The solver is named explicitly: the L1 penalty is not supported by every
# solver, and the library default has changed between scikit-learn releases.
# liblinear is the solver shown in the recorded output of the fit cell.
model_logr1 = LogisticRegression(penalty="l1",class_weight=None,random_state=2,solver="liblinear")

In [50]:
# Fit model 1 on the 80% split.
model_logr1.fit(X=x80_train, y=y80_train)


Out[50]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [70]:
# Hard 0/1 class labels predicted for the 20% hold-out rows.
y20_test_pred1 = np.where(model_logr1.predict(x20_test)==1,1,0)
# Pair each prediction with the V1 value of the row it was made for.
# V1 is taken from the hold-out split itself, so the frame always has exactly
# one row per prediction and never depends on the unrelated `cd_test` file.
temp_df1 = pd.DataFrame({'V1': np.asarray(x20_test['V1']), 'V86': y20_test_pred1}, columns=['V1','V86'])

y_test_pred1 = temp_df1['V86']

In [72]:
# y_test_pred1 holds thresholded 0/1 labels, so this score equals the mean of
# sensitivity and specificity at the default cutoff.
roc_auc_score(y_true=y20_test, y_score=y_test_pred1)


Out[72]:
0.50574923547400619

2. Check ROC_AUC_SCORE {penalty='l2', class_weight=None}


In [53]:
# L2-penalised logistic regression without class re-weighting.
# The solver is pinned to liblinear (the one shown in the recorded output of
# the fit cell) so the result does not depend on the library's default solver,
# which has changed between scikit-learn releases.
model_logrl2 = LogisticRegression(penalty="l2",class_weight=None,random_state=2,solver="liblinear")

In [54]:
# Fit model 2 on the 80% split.
model_logrl2.fit(X=x80_train, y=y80_train)


Out[54]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [57]:
# Hard 0/1 class labels predicted for the 20% hold-out rows.
y20_test_pred2 = np.where(model_logrl2.predict(x20_test)==1,1,0)
# Pair each prediction with the V1 value of the row it was made for.
# V1 is taken from the hold-out split itself, so the frame always has exactly
# one row per prediction and never depends on the unrelated `cd_test` file.
temp_df2 = pd.DataFrame({'V1': np.asarray(x20_test['V1']), 'V86': y20_test_pred2}, columns=['V1','V86'])

y_test_pred2 = temp_df2['V86']

In [58]:
# y_test_pred2 holds thresholded 0/1 labels, so this score equals the mean of
# sensitivity and specificity at the default cutoff.
roc_auc_score(y_true=y20_test, y_score=y_test_pred2)


Out[58]:
0.50574923547400619

3. Check ROC_AUC_SCORE {penalty='l1', class_weight='balanced'}


In [59]:
# L1-penalised logistic regression with balanced class weights.
# The solver is named explicitly: the L1 penalty is not supported by every
# solver, and the library default has changed between scikit-learn releases.
# liblinear is the solver shown in the recorded output of the fit cell.
model_logr3 = LogisticRegression(penalty="l1",class_weight="balanced",random_state=2,solver="liblinear")

In [60]:
# Fit model 3 on the 80% split.
model_logr3.fit(X=x80_train, y=y80_train)


Out[60]:
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=2,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [61]:
# Hard 0/1 class labels predicted for the 20% hold-out rows.
y20_test_pred3 = np.where(model_logr3.predict(x20_test)==1,1,0)
# Pair each prediction with the V1 value of the row it was made for.
# V1 is taken from the hold-out split itself, so the frame always has exactly
# one row per prediction and never depends on the unrelated `cd_test` file.
temp_df3 = pd.DataFrame({'V1': np.asarray(x20_test['V1']), 'V86': y20_test_pred3}, columns=['V1','V86'])

y_test_pred3 = temp_df3['V86']

In [62]:
# y_test_pred3 holds thresholded 0/1 labels, so this score equals the mean of
# sensitivity and specificity at the default cutoff.
roc_auc_score(y_true=y20_test, y_score=y_test_pred3)


Out[62]:
0.67596330275229366

4. Check ROC_AUC_SCORE {penalty='l2', class_weight='balanced'}


In [116]:
# L2-penalised logistic regression with balanced class weights, fitted with
# the Newton-CG solver.
model_logr4 = LogisticRegression(
    penalty="l2",
    class_weight="balanced",
    solver="newton-cg",
    random_state=2,
)

In [117]:
# Fit model 4 on the 80% split.
model_logr4.fit(X=x80_train, y=y80_train)


Out[117]:
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=2,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [118]:
# Hard 0/1 class labels predicted for the 20% hold-out rows.
y20_test_pred4 = np.where(model_logr4.predict(x20_test)==1,1,0)
# Pair each prediction with the V1 value of the row it was made for.
# V1 is taken from the hold-out split itself, so the frame always has exactly
# one row per prediction and never depends on the unrelated `cd_test` file.
temp_df4 = pd.DataFrame({'V1': np.asarray(x20_test['V1']), 'V86': y20_test_pred4}, columns=['V1','V86'])

y_test_pred4 = temp_df4['V86']

In [119]:
# y_test_pred4 holds thresholded 0/1 labels, so this score equals the mean of
# sensitivity and specificity at the default cutoff.
roc_auc_score(y_true=y20_test, y_score=y_test_pred4)


Out[119]:
0.67688073394495418

In [124]:
# Second column of predict_proba for every row of the 80% split, as a Series
# with a fresh 0..n-1 index.
prob_score = pd.Series(model_logr4.predict_proba(x80_train)[:, 1])

2. Calculate optimum FBeta score

a. Calculate cutoffs and best KS


In [127]:
# 100 evenly spaced candidate cutoffs covering the closed interval [0, 1].
cutoffs = np.linspace(start=0, stop=1, num=100)

For each of these cutoffs, we look at the TP, FP, TN and FN counts and calculate KS. We then choose the best cutoff as the one with the highest KS.


In [129]:
# KS statistic (true-positive rate minus false-positive rate) on the 80%
# split for every candidate cutoff. Labels and scores are compared by
# position.
real_labels = np.asarray(y80_train)
train_scores = np.asarray(prob_score)
is_pos = real_labels == 1
is_neg = real_labels == 0
# Class sizes do not depend on the cutoff, so they are computed once.
P = int(is_pos.sum())
N = int(is_neg.sum())

KS_cut=[]
for cutoff in cutoffs:
    flagged = train_scores > cutoff
    TP = int((flagged & is_pos).sum())
    FP = int((flagged & is_neg).sum())
    KS=(TP/P)-(FP/N)
    KS_cut.append(KS)

cutoff_data=pd.DataFrame(list(zip(cutoffs,KS_cut)),columns=["cutoff","KS"])

# idxmax returns the first row that reaches the maximum, so KS_cutoff is a
# single scalar even when several cutoffs tie; float(KS_cutoff) stays valid.
KS_cutoff=cutoff_data.loc[cutoff_data["KS"].idxmax(),"cutoff"]

Now we'll see how this model, with the cutoff determined here, performs on the test data.


In [132]:
# Performance on test data
# Second column of predict_proba for every hold-out row.
prob_score_test=pd.Series(model_logr4.predict_proba(x20_test)[:,1])

# KS_cutoff may be a scalar or a Series (several rows if cutoffs tie);
# the first value is used in either case.
best_cutoff=float(np.ravel(KS_cutoff)[0])
predicted_test=pd.Series([0]*len(y20_test))
predicted_test[prob_score_test>best_cutoff]=1

df_test=pd.DataFrame(list(zip(y20_test,predicted_test)),columns=["real","predicted"])

# Force a full 2x2 table so the positional look-ups below stay valid even
# when one class never appears among the labels or the predictions.
k=pd.crosstab(df_test['real'],df_test["predicted"]).reindex(
    index=pd.Index([0,1],name='real'),
    columns=pd.Index([0,1],name='predicted'),
    fill_value=0)
print('confusion matrix :\n \n ',k)
TN=k.iloc[0,0]
TP=k.iloc[1,1]
FP=k.iloc[0,1]
FN=k.iloc[1,0]
P=TP+FN
N=TN+FP


confusion matrix :
 
  predicted    0    1
real               
0          833  257
1           34   41

In [136]:
# Share of hold-out rows classified correctly.
accuracy = (TP + TN) / (P + N)
# Share of real positives that were flagged (true-positive rate).
sensitivity = TP / P
# Share of real negatives that were left unflagged (true-negative rate).
specificity = TN / N

for caption, metric in (
    ("accuracy : ", accuracy),
    ("sensitivity : ", sensitivity),
    ("specificity : ", specificity),
):
    print(caption, metric)


accuracy :  0.750214592275
sensitivity :  0.546666666667
specificity :  0.764220183486

Next we see how the cutoff determined by the F_beta score performs on the test data for beta values: 0.5, 1, 1.5, 2, 2.5 and 3.


In [141]:
# 100 evenly spaced candidate cutoffs from 0.01 to 0.99 (both included).
cutoffs = np.linspace(start=0.010, stop=0.99, num=100)
def Fbeta_perf(beta,cutoffs,y80_train,prob_score,model=None,x_test=None,y_test=None):
    """Pick the cutoff with the best F-beta on the training split and report
    how that cutoff performs on a test split.

    Parameters
    ----------
    beta : float
        Weight of recall relative to precision in the F-beta score.
    cutoffs : sequence of float
        Candidate probability cutoffs; a row is flagged positive when its
        score is strictly greater than the cutoff.
    y80_train : array-like of 0/1
        Real labels of the training split.
    prob_score : array-like of float
        Scores for the same rows as ``y80_train``, matched by position.
    model, x_test, y_test : optional
        Fitted classifier exposing ``predict_proba``, test features and test
        labels. When omitted, the notebook-level ``model_logr4``,
        ``x20_test`` and ``y20_test`` are used, as before.

    Returns
    -------
    None
        Accuracy, sensitivity and specificity on the test split are printed.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if model is None:
        model=model_logr4
    if x_test is None:
        x_test=x20_test
    if y_test is None:
        y_test=y20_test

    real=np.asarray(y80_train)
    scores=np.asarray(prob_score)
    cutoff_values=np.asarray(cutoffs,dtype=float)
    is_pos=real==1
    is_neg=real==0
    beta_sq=beta**2

    FB_cut=[]
    for cutoff in cutoff_values:
        flagged=scores>cutoff
        TP=int(np.sum(flagged & is_pos))
        FP=int(np.sum(flagged & is_neg))
        FN=int(np.sum(~flagged & is_pos))
        if TP==0:
            # Without a true positive, precision or the F-beta ratio itself
            # would divide by zero; such a cutoff scores 0.
            FB_cut.append(0.0)
            continue
        Precision=TP/(TP+FP)
        Recall=TP/(TP+FN)
        FB_cut.append((1+beta_sq)*Precision*Recall/(beta_sq*Precision+Recall))

    # argmax returns the first maximum, so ties resolve to a single cutoff.
    FB_cutoff=float(cutoff_values[int(np.argmax(FB_cut))])

    # Second column of predict_proba for every test row.
    prob_score_test=np.asarray(model.predict_proba(x_test))[:,1]
    predicted_test=prob_score_test>FB_cutoff
    real_test=np.asarray(y_test)

    # Counting cells directly stays valid even when one class is never
    # predicted.
    TP=np.sum(predicted_test & (real_test==1))
    FP=np.sum(predicted_test & (real_test==0))
    TN=np.sum(~predicted_test & (real_test==0))
    FN=np.sum(~predicted_test & (real_test==1))
    P=TP+FN
    N=TN+FP
    print('For beta :',beta)
    print('Accuracy is :',(TP+TN)/(P+N))
    print('Sensitivity is :',(TP/P))
    print('Specificity is :',(TN/N))
    print('\n \n \n')

In [178]:
# Report hold-out performance at the F-beta-optimal cutoff for several betas.
for beta in (0.5, 1, 1.5, 2, 2.5, 3.0):
    Fbeta_perf(beta, cutoffs, y80_train, prob_score)


For beta : 0.5
Accuracy is : 0.915021459227
Sensitivity is : 0.226666666667
Specificity is : 0.962385321101

 
 

For beta : 1
Accuracy is : 0.843776824034
Sensitivity is : 0.426666666667
Specificity is : 0.87247706422

 
 

For beta : 1.5
Accuracy is : 0.781115879828
Sensitivity is : 0.506666666667
Specificity is : 0.8

 
 

For beta : 2
Accuracy is : 0.763090128755
Sensitivity is : 0.533333333333
Specificity is : 0.778899082569

 
 

For beta : 2.5
Accuracy is : 0.763090128755
Sensitivity is : 0.533333333333
Specificity is : 0.778899082569

 
 

For beta : 3.0
Accuracy is : 0.634334763948
Sensitivity is : 0.693333333333
Specificity is : 0.630275229358

 
 

b. Calculate FBeta score on original optimal model {model_logr4}


In [147]:
from sklearn.metrics import fbeta_score

In [177]:
# F-beta of model 4's hard hold-out predictions under every averaging mode,
# for five beta values between 1 and 3.
betas = np.linspace(start=1, stop=3, num=5)
for ta in betas:
    print('\n')
    print('Beta : ', ta)
    fscorema, fscoremi, fscorew, fscoren = (
        fbeta_score(y20_test, y_test_pred4, average=mode, beta=ta)
        for mode in ('macro', 'micro', 'weighted', None)
    )
    print('fscore_ma : ', fscorema)
    print('fscore_mi : ', fscoremi)
    print('fscore_w : ', fscorew)
    print('fscore_n : ', fscoren)



Beta :  1.0
fscore_ma :  0.220689655172
fscore_mi :  0.220689655172
fscore_w :  0.220689655172
fscore_n :  [ 0.82110818  0.22068966]


Beta :  1.5
fscore_ma :  0.295035460993
fscore_mi :  0.295035460993
fscore_w :  0.295035460993
fscore_n :  [ 0.77620875  0.29503546]


Beta :  2.0
fscore_ma :  0.363636363636
fscore_mi :  0.363636363636
fscore_w :  0.363636363636
fscore_n :  [ 0.75314618  0.36363636]


Beta :  2.5
fscore_ma :  0.419909502262
fscore_mi :  0.419909502262
fscore_w :  0.419909502262
fscore_n :  [ 0.74046603  0.4199095 ]


Beta :  3.0
fscore_ma :  0.463768115942
fscore_mi :  0.463768115942
fscore_w :  0.463768115942
fscore_n :  [ 0.73292511  0.46376812]

In [165]:
# Show the scores left over from the most recent run of the beta loop.
for caption, score in (
    ('fscorema : ', fscorema),
    ('fscoremi : ', fscoremi),
    ('fscorew : ', fscorew),
    ('fscoren : ', fscoren),
):
    print(caption, score)


fscorema :  0.419909502262
fscoremi :  0.419909502262
fscorew :  0.419909502262
fscoren :  [ 0.74046603  0.4199095 ]

Fit the optimized model on actual x,y and predict y from test dataset


In [ ]:
# Refit model 4 on the complete training data.
model_logr4.fit(X=x, y=y)

In [181]:
# "Yes"/"No" label for every row of the test file.
prediction = np.where(model_logr4.predict(cd_test) == 1, "Yes", "No")
# Two-column frame: the row's V1 value next to its predicted label.
submission = pd.DataFrame(
    {'V1': cd_test['V1'].values, 'V86': prediction},
    columns=['V1', 'V86'],
)

In [182]:
# Predicted labels for the test file and real labels of the training file.
# NOTE(review): these two series come from different files, so they do not
# describe the same rows.
pred_y, actual_y = submission['V86'], cd_train['V86']

In [183]:
# Preview the first four rows of the submission frame.
submission.iloc[:4]


Out[183]:
V1 V86
0 33 No
1 6 Yes
2 39 No
3 9 Yes

In [185]:
# Write the submission to disk without the row index.
submission.to_csv(path_or_buf='submission_carvan.csv', index=False)

This submission will get you an AUC score of approx. 0.50, slightly less than what's required for passing the course. You'll have to make changes.